import json

# Characters that would break the one-record-per-line, tab-separated output
# if they appeared inside a field; each one is replaced by a single space.
_FIELD_BREAKS = str.maketrans({'\t': ' ', '\r': ' ', '\n': ' '})


def _clean_field(text):
    """Return *text* with tabs/line breaks turned into spaces, then stripped."""
    return text.translate(_FIELD_BREAKS).strip()


def process_squad_data(input_file, output_file):
    """Flatten a SQuAD-style JSON file into tab-separated text records.

    The input is read as UTF-8 JSON and walked as
    ``data -> paragraphs -> qas``. For every question one line of the form
    ``context<TAB>question<TAB>answer<TAB>`` is written to *output_file*
    (UTF-8). The answer is the text of the first entry in ``answers``, or
    the literal ``"No answer."`` when the question has no answers.

    Tabs, carriage returns and newlines inside a field are replaced by
    spaces so that each record stays on exactly one line with exactly three
    fields.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the text file to create or overwrite.

    Returns:
        None. If *input_file* does not exist, a message is printed and the
        function returns without touching *output_file*.

    Raises:
        KeyError: If an expected key (``data``, ``paragraphs``, ``context``,
            ``qas``, ``question``, ``answers``, ``text``) is missing.
        json.JSONDecodeError: If *input_file* is not valid JSON.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            squad_data = json.load(f)
    except FileNotFoundError:
        print(f"无法找到语料文件'{input_file}'，请检查文件是否存在。")
        return

    # Build every record before opening the output so that a malformed input
    # never leaves a partially written file behind.
    train_datas = []
    for article in squad_data['data']:
        for paragraph in article['paragraphs']:
            context = _clean_field(paragraph['context'])
            for qa in paragraph['qas']:
                question = _clean_field(qa['question'])
                answers = qa['answers']
                # Only the first listed answer is used; questions without
                # any answer get a fixed placeholder.
                answer = _clean_field(answers[0]['text']) if answers else "No answer."
                # The trailing tab is part of the established record format.
                train_datas.append(f"{context}\t{question}\t{answer}\t")

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(f"{record}\n" for record in train_datas)

    print(f"处理后的数据已保存至'{output_file}'。")

if __name__ == '__main__':
    # Script entry point: convert the SQuAD training JSON into the flat,
    # tab-separated text file next to it.
    source_json = '/data/whl/cl/gpt2/dataset/train-v2.0.json'
    target_txt = '/data/whl/cl/gpt2/dataset/processed_squad_data.txt'
    process_squad_data(input_file=source_json, output_file=target_txt)